library(dplyr)
library(ggplot2)
infn = "../data/human.csv"
metadata = read.table(infn, sep=",", header=TRUE)
metadata$condition = ifelse(grepl("Healthy", metadata$What), "HC",
                     ifelse(grepl("NASH", metadata$What), "OSA",
                     ifelse(grepl("Steat", metadata$What), "SS", "NAFLD")))
metadata$condition = as.factor(metadata$condition)
metadata$spiked = ifelse(grepl("Spike", metadata$Spike.In), "spike", "nospike")
metadata$column = ifelse(grepl("ymo", metadata$Column), "zymo",
                  ifelse(grepl("None", metadata$Column), "none", "other"))
metadata$id = paste(metadata$Full.Sample.Name, metadata$column, metadata$spiked, metadata$condition, sep="_")
set.seed(12345)
library(NanoStringQCPro)
rccFiles = paste("../data/rcc/", metadata[,2], sep="")
eset = newRccSet(rccFiles=rccFiles)
## Reading RCC files...
## checkRccSet() messages:
##   Optional featureData cols not found: BarCode, ProbeID, TargetSeq, Species, Comments
##   The following panel housekeeping genes were found: B2M|0, GAPDH|0, RPL19|0, ACTB|0, RPLP0|0
## Warning in .local(rccSet, ...): 
##   Non-standard CodeClass values found in featureData (values currently recognized are "Endogenous", "Housekeeping", "Positive", and "Negative"): Ligation, Endogenous1, SpikeIn)
colnames(eset) = metadata$id

Scale miRNA with known crosstalk

A small set of miRNA have known inflated values; Nanostring has a heuristic to correct for the inflation via the formula using some supplied constants. Here we implement that fix to correct the expression of those miRNA.

correct_miRNA = function(eset) {
  require(dplyr)
  fdat = fData(eset)
  fdat = fdat %>%
    tidyr::separate(GeneName, c("Name", "scalefactor"), sep='\\|',
                    fill="right", remove=FALSE) %>%
    dplyr::mutate(scalefactor=ifelse(is.na(scalefactor), 0, scalefactor))
  sf = as.numeric(fdat$scalefactor)
  posa = as.numeric(exprs(eset)["Positive_POS_A_ERCC_00117.1",])
  scales = t(replicate(length(sf), posa))
  scales = scales * sf
  scaled = exprs(eset) - scales
  scaled[scaled<0] = 0
  return(round(scaled))
}

plot_corrected_miRNA = function(scaled, eset) {
  require(reshape)
  require(ggplot2)
  counts = exprs(eset)
  rownames(counts) = rownames(scaled)
  is_scaled = rowSums(abs(counts - scaled)) > 0
  counts = melt(as.matrix(counts[is_scaled,]))
  colnames(counts) = c("gene", "sample", "value")
  counts$scaled = "no"
  scaled = melt(as.matrix(scaled[is_scaled,]))
  colnames(scaled) = c("gene", "sample", "value")
  scaled$scaled = "yes"
  all = rbind(counts, scaled)
  library(cowplot)
  ggplot(all, aes(sample, value, color=scaled)) + facet_wrap(~gene) +
    geom_point(size=0.5) +
    theme(axis.title.x=element_blank(),
          axis.text.x=element_blank(),
          axis.ticks.x=element_blank(),
          strip.text.x=element_text(size=8)) +
    guides(colour = guide_legend(override.aes = list(size=6))) + ylab("") +
    ggtitle("miRNA hybridization crosstalk correction")
  }
scaled = correct_miRNA(eset)
plot_corrected_miRNA(scaled, eset)
## Loading required package: reshape
## 
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
## 
##     rename
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
## 
##     ggsave

counts = as.data.frame(scaled)
colnames(counts) = metadata$id

Imaging QC

This percentage should be super high, close to 100%. If there isn’t that usually means the cartridge needs to be rescanned. If it is < 3 weeks from the scan, rescanning should be okay. Nanostring folks call < 75% a failure. These all look fine.

library(cowplot)
pdat = pData(eset) %>%
  tibble::rownames_to_column() %>%
  left_join(metadata, by=c("rowname"="id"))
pdat$pcounted = pdat$FovCounted/pdat$FovCount * 100
ggplot(pdat, aes(rowname, pcounted)) + geom_point() +
    theme(axis.text.x=element_blank(),
          axis.ticks.x=element_blank(),
          strip.text.x=element_text(size=8)) +
  scale_y_continuous(expand = c(0,0)) +
  expand_limits(y = c(0,1.05 * max(pdat$pcounted))) +
  ylab("percentage of FOV counted") + xlab("sample") +
  geom_hline(yintercept=75, color="red")

Binding density

Binding density looks ok.

ggplot(pdat, aes(rowname, BindingDensity)) + geom_point() +
    theme(axis.text.x=element_blank(),
          axis.ticks.x=element_blank(),
          strip.text.x=element_text(size=8)) +
  scale_y_continuous(expand = c(0,0)) +
  expand_limits(y = c(0,1.05 * max(pdat$BindingDensity))) +
  ylab("Binding density") + xlab("sample") +
  geom_hline(yintercept=0.05, color="red") +
  geom_hline(yintercept=2.25, color="red")

Total counts vs miRNA detected

Here I plotted the total counts vs the number of miRNA detected, where detected is it has counts > 30. There is a pretty big spread among the samples in terms of the miRNA detected. The Zymo columns seem to all have a low number of miRNA detected.

endocounts = counts[grepl("Endo", rownames(counts)),]
cdf = data.frame(total=colSums(counts), detected=colSums(counts > 30))
rownames(cdf) = colnames(counts)
cdf$id = rownames(cdf)
cdf = cdf %>% left_join(metadata, by="id")
ggplot(cdf, aes(total, detected, color=column, label=condition)) +
  geom_text()

library(ggplot2)
library(dplyr)
library(cowplot)
is_positive = function(column) {
  return(grepl("Pos", column))
}
is_negative = function(column) {
  return(grepl("Neg", column))
}
is_spikein = function(column) {
  return(grepl("Spike", column))
}
is_ligation = function(column) {
  return(grepl("Ligati", column))
}
is_housekeeping = function(column) {
  return(grepl("Housekee", column))
}
is_prior = function(column) {
  return(grepl("miR-159", column) | grepl("miR-248", column) |
         grepl("miR-254", column))
}

extract_pred = function(counts, predicate) {
  toplot = counts[predicate(rownames(counts)),] %>%
    tibble::rownames_to_column() %>%
    tidyr::gather("sample", "count", -rowname)
  colnames(toplot) = c("spot", "sample", "count")
  toplot = toplot %>% left_join(metadata, by=c("sample"="id"))
  return(toplot)
}
spotbarplot = function(toplot) {
  ggplot(toplot,
        aes(sample, count)) + geom_bar(stat='identity') +
    facet_wrap(~spot) +
    theme(axis.text.x = element_blank(),
          text = element_text(size=8))
}
spotboxplot = function(toplot) {
  ggplot(toplot,
        aes(linehypo, count)) + geom_boxplot() +
    facet_wrap(~spot) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}

Positive controls

Below we look at the R^2 correlation between the expected positive control concentrations and the observed concentrations for each sample.

A set of samples have a lower correlation than the other samples.

spotbarplot(extract_pred(counts, is_positive))

pcdf = data.frame(concentration=log2(c(128, 32, 8, 2, 0.5, 0.125)),
                  GeneName=paste("POS", c("A", "B", "C", "D", "E", "F"), sep="_"))
pccounts = subset(exprs(eset), grepl("Positive_POS", rownames(exprs(eset))))
pccounts = pccounts[sort(rownames(pccounts)),]
rownames(pccounts) = pcdf$GeneName
corsamples = data.frame(correlation=apply(pccounts, 2,
                                          function(x) cor(x, pcdf$concentration)),
                        sample=colnames(pccounts)) %>%
  left_join(metadata, by=c("sample"="id"))
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## character vector and factor, coercing into character vector
ggplot(corsamples, aes(sample, correlation)) + geom_point() +
    theme(axis.text.x=element_blank(),
          axis.ticks.x=element_blank(),
          strip.text.x=element_text(size=8)) +
  scale_y_continuous(expand = c(0,0)) +
  expand_limits(y = c(0,1.05 * max(corsamples$correlation))) +
  ylab("positive control correlation") +
  xlab("sample")

Those are all Zymo columns. 8 of the 10 Zymo columns have this problem. If it looks like they have other issues, it might be best to just exclude them.

subset(corsamples, correlation < 0.80)$sample
## [1] "179ON_zymo_spike_OSA" "258ON_zymo_spike_OSA" "035ON_zymo_spike_OSA"
## [4] "448ON_zymo_spike_OSA" "048SS_zymo_spike_SS"  "037ss_zymo_spike_SS" 
## [7] "163SS_zymo_spike_SS"  "114SS_zymo_spike_SS"

Negative controls

We can see some samples have a higher negative control count than the other samples.

spotbarplot(extract_pred(counts, is_negative))

knitr::kable(subset(extract_pred(counts, is_negative), count > 50))
spot sample count File.Name FULL.RCC.File.Name Pure.Sample.Name Full.Sample.Name Clinical.Category What Column Spike.In Notes Date.Shipped RCC.File.Date condition spiked column
NA
17 Negative_NEG_C_ERCC_00019.1 T1N_none_nospike_HC 75 ..03 20150421_PHS 1_01_03.RCC T1 T1N Healthy Control Healthy control 1 -vial 1 None None optimization 4.6.15 4.21.2015 HC nospike none
25 Negative_NEG_C_ERCC_00019.1 T1A_other_nospike_HC 84 ..04 20150421_PHS 1_01_04.RCC T1 T1A Healthy Control Healthy control 1 -vial 1 Amicon None optimization 4.6.15 4.21.2015 HC nospike other
NA
NA
NA
NA

Noise floor cutoff

We’ll normalize the libraries and look at the negative control expression and then come up with a cutoff that is above the noise floor for the libraries. It looks like 30 is a reasonable noise floor to use.

drop_unusable = function(counts) {
  drop = is_spikein(rownames(counts))
  drop = drop | is_positive(rownames(counts))
  drop = drop | is_housekeeping(rownames(counts))
  drop = drop | is_ligation(rownames(counts))
  keep = counts[!drop,]
  keep = keep[, !grepl("Blank", colnames(keep))]
  return(keep)
}
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:reshape':
## 
##     expand, rename
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following objects are masked from 'package:base':
## 
##     colMeans, colSums, expand.grid, rowMeans, rowSums
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, regroup, slice
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
dds = DESeqDataSetFromMatrix(drop_unusable(counts), colData=metadata,
                             design=~spiked+column+condition)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds = estimateSizeFactors(dds)
ncounts = data.frame(counts(dds, normalized=TRUE))
ggplot(extract_pred(ncounts, is_negative), aes(count)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

nfloor = 30

SpikeIn

We don’t have many spike in sets in this sample either, so we can’t really use the spike ins.

spotbarplot(extract_pred(counts, is_spikein))

knitr::kable(subset(extract_pred(counts, is_spikein), count > 100))
spot sample count File.Name FULL.RCC.File.Name Pure.Sample.Name Full.Sample.Name Clinical.Category What Column Spike.In Notes Date.Shipped RCC.File.Date condition spiked column
NA
113 SpikeIn_cel-miR-254|0_MIMAT0000310 A1_none_spike_HC 305 ..07 20160114_ES011416miRNAV3a_Stock sampleA1_07.RCC A1 A1 Healthy Control Healthy control 4 - vial 1 None +Spike In optimization 1.4.16 1.14.16 HC spike none
118 SpikeIn_cel-miR-254|0_MIMAT0000310 A10_zymo_spike_HC 201 ..08 20160114_ES011416miRNAV3a_Stock sampleA2O_08.RCC A10 A10 Healthy Control Healthy control 4 -vial 10 Zymo +Spike In optimization 1.4.16 1.14.16 HC spike zymo

Ligation

Some of the ligation controls are drastically different between samples.

spotbarplot(extract_pred(counts, is_ligation))

Ligation control R^2

Here we look at R^2 for the ligation controls as well. It looks like one sample looks bad. It is a Zymo column.

pcdf = data.frame(concentration=log2(c(128, 32, 8)),
                  GeneName=paste("POS_", c("A", "B", "C"), sep="_"))
pccounts = subset(exprs(eset), grepl("Ligation_LIG_POS", rownames(exprs(eset))))
pccounts = pccounts[sort(rownames(pccounts)),]
rownames(pccounts) = pcdf$GeneName
corsamples = data.frame(correlation=apply(pccounts, 2,
                                          function(x) cor(x, pcdf$concentration)),
                        sample=colnames(pccounts)) %>%
  left_join(metadata, by=c("sample"="id"))
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## character vector and factor, coercing into character vector
ggplot(corsamples, aes(sample, correlation, color=column)) + geom_point() +
    theme(axis.text.x=element_blank(),
          axis.ticks.x=element_blank(),
          strip.text.x=element_text(size=8)) +
  scale_y_continuous(expand = c(0,0)) +
  expand_limits(y = c(0,1.05 * max(corsamples$correlation))) +
  ylab("ligation control correlation") +
  xlab("sample")

Ligation efficiency

Here we calculated ligation efficiency for the samples. This is the log of LIG_POS_A(i) / mean(LIG_POS_A) for each sample i. It doesn’t matter which direction this is in, it will get corrected for in the model. Again we can see most of the Zymo columns are outliers in this.

lignorm = function(counts, feature="Ligation_LIG_POS_A") {
  ligcounts = as.numeric(counts[grepl(feature, rownames(counts)),])
  lnorm = log2((ligcounts + 1) / mean(ligcounts))
  names(lnorm) = colnames(counts)
  return(lnorm)
}
metadata$lignorm = lignorm(counts)
ggplot(metadata, aes(id, lignorm, color=column)) + geom_point() +
    theme(axis.text.x=element_blank(),
          axis.ticks.x=element_blank(),
          strip.text.x=element_text(size=8)) +
  ylab("ligation factor") +
  xlab("sample")

Housekeeping

Some housekeeping genes are expressed more highly in some samples. Not sure what we are supposed to do with this information, if there are any suggestions we could easily implement it.

spotbarplot(extract_pred(counts, is_housekeeping))

Drop control miRNA

drop_unusable = function(counts) {
  drop = is_spikein(rownames(counts))
  drop = drop | is_positive(rownames(counts))
  drop = drop | is_housekeeping(rownames(counts))
  drop = drop | is_ligation(rownames(counts))
  keep = counts[!drop,]
  keep = keep[, !grepl("Blank", colnames(keep))]
  return(keep)}
counts = drop_unusable(counts)
metadata = subset(metadata, id %in% colnames(counts))

Does normalization for ligation effiency buy us anything?

Here we fit two models. The first model we fit the full model, including the ligation normalization term in the model. ~month+column+line+hypoxia+lignorm. Then we fit a reduced model without the lignorm term to answer the question ‘is there miRNA that are affected by the ligation efficiency?’

Yes, it looks like even after normalization, if we fit a model with just the ligation efficiency that there are miRNA affected. So we should correct for that. Here you can see only a subset of the miRNA are affected; if we do it like this instead of just scaling the counts, we can correct for the specific miRNA that have counts that are correlated with the ligation efficiency.

full = ~spiked+column+lignorm+condition
reduced = ~spiked+column+condition
dds = DESeqDataSetFromMatrix(counts, colData=metadata,
                             design=full)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds = DESeq(dds, full=full, reduced=reduced, test="LRT")
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
res = results(dds)
plotMA(res)

res = data.frame(res) %>% tibble::rownames_to_column() %>%
  arrange(padj) %>% filter(padj < 0.05)
knitr::kable(subset(res, padj < 0.05))
rowname baseMean log2FoldChange lfcSE stat pvalue padj
Endogenous1_hsa-miR-3144-3p|0.004_MIMAT0015015 16.154394 -0.9027776 0.7035563 24.617774 0.0000007 0.0001418
Endogenous1_hsa-miR-520f-3p|0_MIMAT0002830 22.485535 0.2807007 0.3115951 25.039419 0.0000006 0.0001418
Endogenous1_hsa-miR-3065-5p|0.004_MIMAT0015066 15.168313 -0.7369420 0.7020276 24.318401 0.0000008 0.0001418
Endogenous1_hsa-miR-1246|0.005_MIMAT0005898 116.492298 -1.2478732 1.0380751 20.863521 0.0000049 0.0005572
Endogenous1_hsa-miR-128-1-5p|0_MIMAT0026477 12.508796 0.5570126 0.3300654 20.687951 0.0000054 0.0005572
Endogenous1_hsa-miR-455-5p|0_MIMAT0003150 14.969105 0.0845011 0.3722611 20.359475 0.0000064 0.0005572
Endogenous1_hsa-miR-5010-3p|0_MIMAT0021044 17.040506 0.0620533 0.4116074 19.903198 0.0000081 0.0006063
Endogenous1_hsa-miR-302d-3p|0.006_MIMAT0000718 25.431722 -0.9682754 0.9945260 19.132930 0.0000122 0.0007940
Endogenous1_hsa-miR-494-3p|0.099_MIMAT0002816 223.791055 3.6463676 2.3874095 17.931783 0.0000229 0.0013255
Endogenous1_hsa-miR-369-3p|0_MIMAT0000721 21.623242 0.4177668 0.3211437 16.698079 0.0000438 0.0022833
Endogenous1_hsa-miR-652-5p|0_MIMAT0022709 9.151777 -0.1158272 0.4449402 15.872030 0.0000678 0.0032100
Endogenous1_hsa-miR-4536-3p|0_MIMAT0020959 16.773122 0.3668208 0.3519415 15.640717 0.0000766 0.0033252
Endogenous1_hsa-miR-301a-3p|0_MIMAT0000688 17.751937 0.1110325 0.3778992 14.534513 0.0001376 0.0055152
Endogenous1_hsa-miR-574-5p|0_MIMAT0004795 28.637916 0.0585036 0.3650492 14.110083 0.0001724 0.0064164
Endogenous1_hsa-miR-137|0_MIMAT0000429 18.632380 0.7748520 0.3287320 13.656989 0.0002194 0.0076213
Endogenous1_hsa-miR-4485-3p|0_MIMAT0019019 16.053723 0.2896278 0.3477983 13.495628 0.0002391 0.0077863
Endogenous1_hsa-miR-526b-5p|0_MIMAT0002835 18.250371 0.5268700 0.3932662 12.731418 0.0003596 0.0110196
Endogenous1_hsa-miR-449b-5p|0_MIMAT0003327 17.239821 0.5104381 0.3297762 12.596221 0.0003865 0.0111878
Endogenous1_hsa-miR-532-3p|0_MIMAT0004780 20.808011 0.3421212 0.3604880 12.254842 0.0004641 0.0121706
Endogenous1_hsa-miR-302a-3p|0_MIMAT0000684 15.481374 0.2836165 0.4152721 12.242215 0.0004672 0.0121706
Endogenous1_hsa-miR-1228-3p|0_MIMAT0005583 20.384874 0.6922482 0.3517373 12.127399 0.0004969 0.0123270
Endogenous1_hsa-miR-508-5p|0_MIMAT0004778 15.773746 0.4023408 0.3453795 11.915263 0.0005568 0.0126118
Endogenous1_hsa-miR-5001-5p|0_MIMAT0021021 17.611257 0.4878175 0.3718612 11.930862 0.0005521 0.0126118
Negative_NEG_G_ERCC_00144.1 14.468579 0.6237302 0.3667778 11.794852 0.0005939 0.0128936
Endogenous1_hsa-miR-2682-5p|0_MIMAT0013517 21.076116 -0.3665701 0.3835922 11.650844 0.0006417 0.0133737
Endogenous1_hsa-miR-3613-3p|0_MIMAT0017991 17.943783 0.3191609 0.3342182 11.308388 0.0007716 0.0148886
Endogenous1_hsa-miR-199a-5p|0_MIMAT0000231 20.873884 0.2023784 0.3238180 11.342755 0.0007574 0.0148886
Endogenous1_hsa-miR-184|0_MIMAT0000454 19.362627 0.6129380 0.3778002 11.102818 0.0008620 0.0154857
Endogenous1_hsa-miR-6720-3p|0_MIMAT0025851 12.246648 0.4268708 0.4125961 11.137908 0.0008458 0.0154857
Endogenous1_hsa-miR-525-5p|0_MIMAT0002838 24.296625 0.4786314 0.3128564 10.909131 0.0009569 0.0166184
Endogenous1_hsa-miR-30c-5p|0_MIMAT0000244 16.472750 -0.0428976 0.3374711 10.499707 0.0011939 0.0172821
Endogenous1_hsa-miR-373-3p|0_MIMAT0000726 14.094550 -0.1613932 0.3455254 10.513527 0.0011850 0.0172821
Endogenous1_hsa-miR-338-5p|0_MIMAT0004701 16.282134 0.3563631 0.3716245 10.504735 0.0011907 0.0172821
Endogenous1_hsa-miR-522-3p|0_MIMAT0002868 21.037217 0.5356803 0.3474974 10.499366 0.0011942 0.0172821
Endogenous1_hsa-miR-1268b|0_MIMAT0018925 21.962312 0.3599242 0.2869007 10.674698 0.0010861 0.0172821
Endogenous1_hsa-miR-487b-3p|0_MIMAT0003180 11.091303 0.4837423 0.4062637 10.616317 0.0011209 0.0172821
Endogenous1_hsa-miR-543|0_MIMAT0004954 18.524290 0.6713847 0.3044068 10.352965 0.0012927 0.0181361
Endogenous1_hsa-miR-27b-3p|0_MIMAT0000419 11.442824 0.4726154 0.3916176 10.310460 0.0013228 0.0181361
Endogenous1_hsa-miR-4286|0_MIMAT0016916 21.179109 0.4028977 0.3281633 10.107572 0.0014766 0.0197261
Endogenous1_hsa-miR-197-5p|0_MIMAT0022691 14.781771 0.4860745 0.3292944 9.994336 0.0015702 0.0204522
Endogenous1_hsa-miR-34c-3p|0_MIMAT0004677 20.676202 0.0706558 0.2792556 9.758553 0.0017849 0.0221413
Endogenous1_hsa-miR-421|0_MIMAT0003339 17.491647 0.1690349 0.3423553 9.779140 0.0017650 0.0221413
Endogenous1_hsa-miR-4647|0_MIMAT0019709 18.236624 0.4144892 0.3341588 9.707018 0.0018357 0.0222413
Endogenous1_hsa-miR-381-3p|0_MIMAT0000736 13.695175 -0.0460788 0.3540132 9.506826 0.0020471 0.0242394
Endogenous1_hsa-miR-3614-5p|0_MIMAT0017992 19.228691 0.5946727 0.3416812 9.309471 0.0022797 0.0263941
Endogenous1_hsa-miR-572|0_MIMAT0003237 13.253703 0.0329145 0.3920173 9.225713 0.0023864 0.0270285
Endogenous1_hsa-miR-211-5p|0_MIMAT0000268 19.204195 0.7713678 0.3405613 9.168648 0.0024620 0.0272911
Endogenous1_hsa-miR-382-5p|0_MIMAT0000737 14.964241 -0.4628502 0.3591842 8.994089 0.0027085 0.0289324
Endogenous1_hsa-miR-6503-5p|0_MIMAT0025462 14.494007 0.7332092 0.3466901 8.985646 0.0027211 0.0289324
Endogenous1_hsa-miR-1206|0_MIMAT0005870 16.661578 -0.1488565 0.3389593 8.881072 0.0028814 0.0292747
Endogenous1_hsa-miR-3161|0_MIMAT0015035 8.770930 0.0764910 0.4115240 8.856163 0.0029210 0.0292747
Endogenous1_hsa-miR-10a-5p|0_MIMAT0000253 19.430113 0.5751018 0.3130499 8.855635 0.0029219 0.0292747
Endogenous1_hsa-miR-891a-5p|0_MIMAT0004902 22.442521 0.3218974 0.3422574 8.750903 0.0030945 0.0304194
Endogenous1_hsa-miR-33b-5p|0_MIMAT0003301 21.862140 0.6661927 0.3448169 8.714991 0.0031560 0.0304499
Endogenous1_hsa-miR-155-5p|0_MIMAT0000646 30.834213 -0.4000943 0.3877161 8.519452 0.0035137 0.0321097
Endogenous1_hsa-miR-3136-5p|0_MIMAT0015003 11.347316 0.3925598 0.3490501 8.457093 0.0036362 0.0321097
Endogenous1_hsa-miR-922|0_MIMAT0004972 17.087206 0.1018960 0.3583880 8.576458 0.0034054 0.0321097
Endogenous1_hsa-miR-183-5p|0_MIMAT0000261 24.081003 0.7381933 0.3084478 8.481537 0.0035877 0.0321097
Endogenous1_hsa-miR-2113|0_MIMAT0009206 10.063128 0.5908673 0.4090225 8.502772 0.0035461 0.0321097
Endogenous1_hsa-miR-1266-5p|0_MIMAT0005920 16.066163 0.2193273 0.2980987 8.297277 0.0039705 0.0344248
Endogenous1_hsa-miR-512-5p|0_MIMAT0002822 16.558368 0.4495301 0.3449292 8.197171 0.0041956 0.0344248
Endogenous1_hsa-miR-548e-5p|0_MIMAT0026736 17.029038 0.1914604 0.3225284 8.201139 0.0041864 0.0344248
Endogenous1_hsa-miR-187-3p|0_MIMAT0000262 16.403869 0.4397486 0.3330983 8.182877 0.0042288 0.0344248
Endogenous1_hsa-miR-767-5p|0_MIMAT0003882 16.911194 0.3655126 0.3676737 8.200679 0.0041875 0.0344248
Endogenous1_hsa-miR-582-3p|0_MIMAT0004797 12.973997 0.5421819 0.3585611 8.124245 0.0043677 0.0344785
Endogenous1_hsa-miR-449a|0_MIMAT0001541 8.114853 0.1930454 0.4756947 8.144047 0.0043203 0.0344785
Endogenous1_hsa-miR-138-5p|0_MIMAT0000430 19.137156 -0.2817369 0.3437245 7.696189 0.0055338 0.0376590
Endogenous1_hsa-miR-423-3p|0_MIMAT0001340 27.126050 -0.3966050 0.3138738 7.404693 0.0065054 0.0376590
Endogenous1_hsa-miR-1307-5p|0_MIMAT0022727 19.934626 0.1644425 0.3349698 7.411452 0.0064810 0.0376590
Endogenous1_hsa-miR-34b-3p|0_MIMAT0004676 16.321653 -0.1801052 0.3385198 7.853464 0.0050723 0.0376590
Endogenous1_hsa-miR-30b-5p|0_MIMAT0000420 25.120072 -0.5503289 0.3473148 7.535839 0.0060484 0.0376590
Endogenous1_hsa-miR-4531|0_MIMAT0019070 18.544776 -0.0995727 0.3249445 7.571689 0.0059292 0.0376590
Endogenous1_hsa-miR-671-5p|0_MIMAT0003880 14.598270 -0.5034353 0.3805830 7.481211 0.0062346 0.0376590
Endogenous1_hsa-miR-337-5p|0_MIMAT0004695 20.455689 0.4918635 0.3039373 7.567834 0.0059419 0.0376590
Endogenous1_hsa-miR-548a-5p|0_MIMAT0004803 27.830500 0.6361400 0.3825574 7.500470 0.0061683 0.0376590
Endogenous1_hsa-miR-132-3p|0_MIMAT0000426 13.169093 0.3280749 0.4226051 7.443532 0.0063665 0.0376590
Endogenous1_hsa-miR-548k|0_MIMAT0005882 14.300429 0.3224224 0.3819228 7.506145 0.0061489 0.0376590
Endogenous1_hsa-miR-3605-5p|0_MIMAT0017981 10.313653 0.3250684 0.4370139 7.422404 0.0064417 0.0376590
Endogenous1_hsa-miR-517c-3p+hsa-miR-519a-3p|0_MIMAT0002866 28.911200 0.2973478 0.3895462 7.485860 0.0062185 0.0376590
Endogenous1_hsa-miR-660-3p|0_MIMAT0022711 19.818373 0.2329490 0.3830516 7.811576 0.0051913 0.0376590
Endogenous1_hsa-miR-551b-3p|0_MIMAT0003233 18.787583 0.1702477 0.3522732 7.649510 0.0056788 0.0376590
Endogenous1_hsa-miR-607|0_MIMAT0003275 20.062353 0.3874095 0.2965702 7.500453 0.0061683 0.0376590
Endogenous1_hsa-miR-1296-3p|0_MIMAT0026637 11.566780 0.9848154 0.3967151 7.647508 0.0056851 0.0376590
Endogenous1_hsa-miR-199b-5p|0_MIMAT0000263 15.288132 0.5409972 0.3429482 7.531401 0.0060633 0.0376590
Endogenous1_hsa-miR-514a-3p|0_MIMAT0002883 20.171994 0.4727561 0.3208240 7.504905 0.0061531 0.0376590
Endogenous1_hsa-miR-335-5p|0_MIMAT0000765 18.744085 0.6214970 0.3538713 7.932888 0.0048544 0.0376590
Endogenous1_hsa-miR-199a-3p+hsa-miR-199b-3p|0_MIMAT0000232 25.579517 -0.5094305 0.4461006 7.453202 0.0063324 0.0376590
Endogenous1_hsa-miR-513a-3p|0_MIMAT0004777 23.417255 0.1990901 0.3463820 7.579395 0.0059039 0.0376590
Endogenous1_hsa-miR-188-3p|0_MIMAT0004613 10.702219 0.6061570 0.3834415 7.447019 0.0063542 0.0376590
Endogenous1_hsa-miR-2116-5p|0_MIMAT0011160 8.801659 0.6553138 0.3957661 7.592225 0.0058621 0.0376590
Endogenous1_hsa-miR-1245a|0_MIMAT0005897 14.623967 0.2202017 0.3881365 7.377185 0.0066056 0.0378192
Endogenous1_hsa-miR-542-3p|0_MIMAT0003389 16.732171 0.5297783 0.3753294 7.323801 0.0068047 0.0381211
Endogenous1_hsa-miR-378d|0_MIMAT0018926 20.172643 0.6257212 0.3385224 7.343099 0.0067321 0.0381211
Endogenous1_hsa-let-7a-5p|0.006_MIMAT0000062 15.413694 0.1996921 2.3777768 7.213052 0.0072375 0.0399564
Endogenous1_hsa-miR-518e-3p|0_MIMAT0002861 9.115780 0.3259874 0.4636558 7.201142 0.0072857 0.0399564
Endogenous1_hsa-miR-628-3p|0_MIMAT0003297 12.523421 -0.4142201 0.3921215 7.141566 0.0075317 0.0401134
Endogenous1_hsa-miR-30e-3p|0_MIMAT0000693 17.391121 0.5948340 0.3273287 7.138333 0.0075453 0.0401134
Endogenous1_hsa-miR-495-5p|0_MIMAT0022924 18.221403 0.5994959 0.3307664 7.147518 0.0075068 0.0401134
Endogenous1_hsa-miR-564|0_MIMAT0003228 25.316992 -0.2625442 0.3523026 6.905362 0.0085938 0.0434694
Endogenous1_hsa-miR-1226-3p|0_MIMAT0005577 16.580422 0.4567798 0.3272328 6.905968 0.0085909 0.0434694
Endogenous1_hsa-miR-515-3p|0_MIMAT0002827 24.116855 -0.1794401 0.2959484 6.946050 0.0084005 0.0434694
Endogenous1_hsa-miR-21-5p|0_MIMAT0000076 24.101615 -1.4589421 0.3837023 6.934158 0.0084565 0.0434694
Endogenous1_hsa-miR-345-3p|0_MIMAT0022698 17.603977 0.5760853 0.3806965 6.914550 0.0085497 0.0434694
Endogenous1_hsa-miR-587|0_MIMAT0003253 21.431597 -0.1268609 0.3514207 6.867602 0.0087772 0.0439706
Endogenous1_hsa-miR-514b-5p|0_MIMAT0015087 26.961500 1.0057010 0.2828945 6.758917 0.0093280 0.0462848
Endogenous1_hsa-miR-216b-5p|0_MIMAT0004959 22.347780 0.0793256 0.3483973 6.694134 0.0096731 0.0470997
Endogenous1_hsa-miR-4787-5p|0_MIMAT0019956 17.134283 -0.4047326 0.3634773 6.710449 0.0095850 0.0470997
Endogenous1_hsa-miR-519d-3p|0_MIMAT0002853 17.130403 -0.3224137 0.3316804 6.628411 0.0100365 0.0484167
Endogenous1_hsa-miR-656-3p|0_MIMAT0003332 20.600164 -0.2671215 0.3247474 6.555701 0.0104549 0.0497452
Endogenous1_hsa-miR-510-3p|0_MIMAT0026613 13.758661 0.9270255 0.3527670 6.543322 0.0105279 0.0497452
Endogenous1_hsa-miR-939-5p|0_MIMAT0004982 20.253974 0.5625802 0.3394075 6.531459 0.0105983 0.0497452
Endogenous1_hsa-miR-1185-5p|0_MIMAT0005798 16.192813 0.8719359 0.3760341 6.509214 0.0107317 0.0499215

PCA

Here we can see the choice of column is what separates out the samples, it is going to be hard to call differences because the column type is confounded with the sample type for the most part. We only have one observation of non-zymo columns for the OSA and SS samples.

dds = DESeqDataSetFromMatrix(counts, colData=metadata, design=full)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
vst = varianceStabilizingTransformation(dds)
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
pca_loadings = function(object, ntop=500) {
  rv <- matrixStats::rowVars(assay(object))
  select <- order(rv, decreasing = TRUE)[seq_len(min(ntop,
      length(rv)))]
  pca <- prcomp(t(assay(object)[select, ]))
  percentVar <- pca$sdev^2/sum(pca$sdev^2)
  names(percentVar) = colnames(pca$x)
  pca$percentVar = percentVar
  return(pca)}
pc = pca_loadings(vst)
comps = data.frame(pc$x)
comps$Name = rownames(comps)
library(dplyr)
comps = comps %>% left_join(metadata, by=c("Name"="id"))
pca_plot = function(comps, nc1, nc2, colorby) {
   c1str = paste0("PC", nc1)
   c2str = paste0("PC", nc2)
  ggplot(comps, aes_string(c1str, c2str, color=colorby)) +
    geom_point() + theme_bw() +
    xlab(paste0(c1str, ": ", round(pc$percentVar[nc1] * 100), "% variance")) +
    ylab(paste0(c2str, ": ", round(pc$percentVar[nc2] * 100), "% variance"))
  }
pca_plot(comps, 1, 2, "condition")

pca_plot(comps, 1, 2, "column")

It isn’t clear what is separating out the samples on the lower left.

Differential expression

write_results = function(res, fname) {
  res = data.frame(res) %>% tibble::rownames_to_column() %>%
    arrange(pvalue)
  write.table(res, file=fname, quote=FALSE, col.names=TRUE,
              row.names=FALSE, sep=",")
}
dds = DESeq(dds, fitType='local')
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
plotDispEsts(dds)

OSA/NASH vs HC

res = results(dds, contrast=c("condition", "OSA", "HC"))
plotMA(res)

write_results(res, "OSA-vs-HC.csv")

OSA/NASH vs SS

res = results(dds, contrast=c("condition", "OSA", "SS"))
plotMA(res)

write_results(res, "OSA-vs-SS.csv")

SS vs HC

res = results(dds, contrast=c("condition", "SS", "HC"))
plotMA(res)

write_results(res, "SS-vs-HC.csv")

Output tables

OSA vs HC

OSA vs SS

SS vs HC

5-11-16 only

The 5-11-16 samples are all Zymo columns, all have spike-ins and are two conditions. This is the cleanest set to look at. We’ll subset the data down to these samples and run the analysis. Since these are all spike-ins we can also normalize by the spike in data. We’ll renormalize the ligation data as well.

scaled = correct_miRNA(eset)
counts = as.data.frame(scaled)
colnames(counts) = metadata$id
maymeta = subset(metadata, Date.Shipped == "5.11.16")
maycounts = counts[, maymeta$id]
maymeta$lignorm = lignorm(maycounts)

Spike-in normalization

Here we will calculate normalization values using the formula suggested by the NanoString folks with the spikeins. We calculate the geometric of all of the spike ins and divide that by the average of the geometric mean

colGeo = function(mat) {
  lmat = log2(mat + 1)
  return(colMeans(lmat))
}
spikenormalize = function(counts) {
  counts = counts[is_spikein(rownames(counts)),]
  geo = colGeo(counts)
  return(mean(geo) / geo)
}
maymeta$spikenorm = spikenormalize(maycounts)

Run DESeq2

deseq2resids = function(dds) {
  fitted = t(t(assays(dds)[["mu"]]) / sizeFactors(dds))
  return(counts(dds, normalized=TRUE) - fitted)
}

design = ~lignorm+condition
maycounts = drop_unusable(maycounts)
dds = DESeqDataSetFromMatrix(maycounts, colData=maymeta, design=design)
## converting counts to integer mode
## factor levels were dropped which had no samples
dds = DESeq(dds, fitType='local')
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
maymeta$sizefactor = sizeFactors(dds)
resdnorm = results(dds, addMLE=TRUE)

Spike-in vs DESeq2 style normalization

DESeq2 has a similar normalization scheme to the spike-ins but instead uses all of the genes, not just the spike-ins. For each gene a scaling factor is created and the median of those scaling factors for each sample is the sizeFactor. How does this compare to just using the spike-ins?

ggplot(maymeta, aes(sizefactor, spikenorm)) +
  geom_point() +
  xlab("DESeq2 normalization") +
  ylab("Spike-in normalization")

Not great! We’ll manually set the size factor for these libraries, then.

dds = DESeqDataSetFromMatrix(maycounts, colData=maymeta, design=design)
## converting counts to integer mode
## factor levels were dropped which had no samples
sizeFactors(dds) = maymeta$spikenorm
dds = DESeq(dds, fitType='local')
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ressnorm = results(dds, addMLE=TRUE)

How did that affect the results? It doesn’t seem to affect them much, plotting the p-values and log2 fold changes against each other:

qplot(resdnorm$pvalue, ressnorm$pvalue) + geom_point() +
  xlab("DESeq2 normalization p-value") + ylab("Spike-in normalization p-value")
## Warning: Removed 19 rows containing missing values (geom_point).

## Warning: Removed 19 rows containing missing values (geom_point).

qplot(resdnorm$log2FoldChange, ressnorm$log2FoldChange) + geom_point() +
  xlab("DESeq2 normalization log2FC") + ylab("Spike-in normalization log2FC")
## Warning: Removed 19 rows containing missing values (geom_point).

## Warning: Removed 19 rows containing missing values (geom_point).

But, overall, the size factors were within 10% of each other for all of the samples, so it isn’t surprising that shifting them doesn’t affect the results much. We’ll use the results from the NanoString normalization though.

res = results(dds, contrast=c("condition", "OSA", "SS"), addMLE=TRUE)
plotMA(res)

write_results(res, "may-OSA-vs-SS.csv")

May OSA-vs-SS

Again, we’re not seeing much here. Nothing pops out as significant; I think to do this we need many more samples.

PCA for May samples

Even looking at some of the higher order PCA components, we’re not seeing any convincing separation.

vst = varianceStabilizingTransformation(dds)
pc = pca_loadings(vst)
comps = data.frame(pc$x)
comps$Name = rownames(comps)
comps = comps %>% left_join(maymeta, by=c("Name"="id"))
pca_plot(comps, 1, 2, "condition")

pca_plot(comps, 3, 4, "condition")

pca_plot(comps, 5, 6, "condition")

With more samples, and more metadata about the samples, we could maybe dive into what might be causing the increased variation, but without that we’re mostly sunk.